import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the CSV export into a DataFrame and preview its first five rows.
air_satisfaction = pd.read_csv(filepath_or_buffer='air satisfaction.csv')
air_satisfaction.head(n=5)
| id | satisfaction_v2 | Gender | Customer Type | Age | Type of Travel | Class | Flight Distance | Seat comfort | Departure/Arrival time convenient | ... | Online support | Ease of Online booking | On-board service | Leg room service | Baggage handling | Checkin service | Cleanliness | Online boarding | Departure Delay in Minutes | Arrival Delay in Minutes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11112 | satisfied | Female | Loyal Customer | 65 | Personal Travel | Eco | 265 | 0 | 0 | ... | 2 | 3 | 3 | 0 | 3 | 5 | 3 | 2 | 0 | 0.0 |
| 1 | 110278 | satisfied | Male | Loyal Customer | 47 | Personal Travel | Business | 2464 | 0 | 0 | ... | 2 | 3 | 4 | 4 | 4 | 2 | 3 | 2 | 310 | 305.0 |
| 2 | 103199 | satisfied | Female | Loyal Customer | 15 | Personal Travel | Eco | 2138 | 0 | 0 | ... | 2 | 2 | 3 | 3 | 4 | 4 | 4 | 2 | 0 | 0.0 |
| 3 | 47462 | satisfied | Female | Loyal Customer | 60 | Personal Travel | Eco | 623 | 0 | 0 | ... | 3 | 1 | 1 | 0 | 1 | 4 | 1 | 3 | 0 | 0.0 |
| 4 | 120011 | satisfied | Female | Loyal Customer | 70 | Personal Travel | Eco | 354 | 0 | 0 | ... | 4 | 2 | 2 | 0 | 2 | 4 | 2 | 5 | 0 | 0.0 |
5 rows × 24 columns
# (rows, columns) of the frame as loaded, before any cleaning.
air_satisfaction.shape
(129880, 24)
# Per-column dtype and non-null count; a count below the row total marks a column with gaps.
air_satisfaction.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 129880 entries, 0 to 129879 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 129880 non-null int64 1 satisfaction_v2 129880 non-null object 2 Gender 129880 non-null object 3 Customer Type 129880 non-null object 4 Age 129880 non-null int64 5 Type of Travel 129880 non-null object 6 Class 129880 non-null object 7 Flight Distance 129880 non-null int64 8 Seat comfort 129880 non-null int64 9 Departure/Arrival time convenient 129880 non-null int64 10 Food and drink 129880 non-null int64 11 Gate location 129880 non-null int64 12 Inflight wifi service 129880 non-null int64 13 Inflight entertainment 129880 non-null int64 14 Online support 129880 non-null int64 15 Ease of Online booking 129880 non-null int64 16 On-board service 129880 non-null int64 17 Leg room service 129880 non-null int64 18 Baggage handling 129880 non-null int64 19 Checkin service 129880 non-null int64 20 Cleanliness 129880 non-null int64 21 Online boarding 129880 non-null int64 22 Departure Delay in Minutes 129880 non-null int64 23 Arrival Delay in Minutes 129487 non-null float64 dtypes: float64(1), int64(18), object(5) memory usage: 23.8+ MB
# Remove the `id` column; nothing below uses it.
air_satisfaction = air_satisfaction.drop(columns=['id'])

# Normalise headers to lower snake_case: spaces, hyphens and slashes all
# become underscores so columns can be referenced as plain identifiers
# (e.g. inside DataFrame.query strings).
air_satisfaction.columns = [
    col.replace(' ', '_').replace('-', '_').replace('/', '_').lower()
    for col in air_satisfaction.columns
]

# Columns treated as categories rather than numbers: the text-valued
# descriptors plus the per-service score columns.  Each name appears once
# (the list previously repeated 'seat_comfort').
categorical_columns = [
    'satisfaction_v2', 'gender', 'customer_type', 'type_of_travel', 'class',
    'seat_comfort', 'departure_arrival_time_convenient',
    'food_and_drink', 'gate_location', 'inflight_wifi_service', 'inflight_entertainment',
    'online_support', 'ease_of_online_booking', 'on_board_service', 'leg_room_service',
    'baggage_handling', 'checkin_service', 'cleanliness', 'online_boarding',
]

# One astype call with a column -> dtype mapping converts every listed column;
# columns not in the mapping keep their dtype.
air_satisfaction = air_satisfaction.astype(
    {col: 'category' for col in categorical_columns}
)
air_satisfaction.head()
| satisfaction_v2 | gender | customer_type | age | type_of_travel | class | flight_distance | seat_comfort | departure_arrival_time_convenient | food_and_drink | ... | online_support | ease_of_online_booking | on_board_service | leg_room_service | baggage_handling | checkin_service | cleanliness | online_boarding | departure_delay_in_minutes | arrival_delay_in_minutes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | satisfied | Female | Loyal Customer | 65 | Personal Travel | Eco | 265 | 0 | 0 | 0 | ... | 2 | 3 | 3 | 0 | 3 | 5 | 3 | 2 | 0 | 0.0 |
| 1 | satisfied | Male | Loyal Customer | 47 | Personal Travel | Business | 2464 | 0 | 0 | 0 | ... | 2 | 3 | 4 | 4 | 4 | 2 | 3 | 2 | 310 | 305.0 |
| 2 | satisfied | Female | Loyal Customer | 15 | Personal Travel | Eco | 2138 | 0 | 0 | 0 | ... | 2 | 2 | 3 | 3 | 4 | 4 | 4 | 2 | 0 | 0.0 |
| 3 | satisfied | Female | Loyal Customer | 60 | Personal Travel | Eco | 623 | 0 | 0 | 0 | ... | 3 | 1 | 1 | 0 | 1 | 4 | 1 | 3 | 0 | 0.0 |
| 4 | satisfied | Female | Loyal Customer | 70 | Personal Travel | Eco | 354 | 0 | 0 | 0 | ... | 4 | 2 | 2 | 0 | 2 | 4 | 2 | 5 | 0 | 0.0 |
5 rows × 23 columns
# Number of missing values in each column.
air_satisfaction.isna().sum()
satisfaction_v2 0 gender 0 customer_type 0 age 0 type_of_travel 0 class 0 flight_distance 0 seat_comfort 0 departure_arrival_time_convenient 0 food_and_drink 0 gate_location 0 inflight_wifi_service 0 inflight_entertainment 0 online_support 0 ease_of_online_booking 0 on_board_service 0 leg_room_service 0 baggage_handling 0 checkin_service 0 cleanliness 0 online_boarding 0 departure_delay_in_minutes 0 arrival_delay_in_minutes 393 dtype: int64
# Compare how many rows lack an arrival-delay value against how many have one,
# and show the split as a pie chart.
arrival_delay_is_null = air_satisfaction['arrival_delay_in_minutes'].isnull()
missing = arrival_delay_is_null.sum()
non_missing = (~arrival_delay_is_null).sum()
labels = ['Missing', 'Non-Missing']
values = [missing, non_missing]
plt.pie(values, autopct='%1.1f%%', labels=labels)
plt.title('Missing vs Non-Missing Values in arrival_delay')
plt.show()

# Discard every row containing a NaN in any column, then re-count the
# missing values per column on the cleaned frame.
clean_air_satisfaction = air_satisfaction.dropna()
clean_air_satisfaction.isna().sum()
satisfaction_v2 0 gender 0 customer_type 0 age 0 type_of_travel 0 class 0 flight_distance 0 seat_comfort 0 departure_arrival_time_convenient 0 food_and_drink 0 gate_location 0 inflight_wifi_service 0 inflight_entertainment 0 online_support 0 ease_of_online_booking 0 on_board_service 0 leg_room_service 0 baggage_handling 0 checkin_service 0 cleanliness 0 online_boarding 0 departure_delay_in_minutes 0 arrival_delay_in_minutes 0 dtype: int64
# Box plot of passenger age to eyeball its spread and any outliers.
sns.boxplot(data=clean_air_satisfaction, x='age')
<Axes: xlabel='age'>
# Box plot of flight distance to eyeball its spread and any outliers.
sns.boxplot(data=clean_air_satisfaction, x='flight_distance')
<Axes: xlabel='flight_distance'>
# Box plot of departure delay (minutes) to eyeball its spread and any outliers.
sns.boxplot(data=clean_air_satisfaction, x='departure_delay_in_minutes')
<Axes: xlabel='departure_delay_in_minutes'>
# Box plot of arrival delay (minutes) to eyeball its spread and any outliers.
sns.boxplot(data=clean_air_satisfaction, x='arrival_delay_in_minutes')
<Axes: xlabel='arrival_delay_in_minutes'>
# Order the cleaned frame by each numeric column, largest value first, so the
# most extreme rows can be listed.
sort_age = clean_air_satisfaction.sort_values(by='age', ascending=False)
sort_flight_distance = clean_air_satisfaction.sort_values(by='flight_distance', ascending=False)
sort_depart_delay = clean_air_satisfaction.sort_values(by='departure_delay_in_minutes', ascending=False)
sort_arrival_delay = clean_air_satisfaction.sort_values(by='arrival_delay_in_minutes', ascending=False)

# Print the leading rows of each ordering: five for age and distance,
# twenty for the two delay columns.
for _ordered, _column, _top_n in (
    (sort_age, 'age', 5),
    (sort_flight_distance, 'flight_distance', 5),
    (sort_depart_delay, 'departure_delay_in_minutes', 20),
    (sort_arrival_delay, 'arrival_delay_in_minutes', 20),
):
    print(f"Top {_top_n}\n {_ordered[[_column]].head(_top_n)}")
Top 5
age
67916 85
92300 85
88349 85
54393 85
111268 85
Top 5
flight_distance
49083 6951
102409 6950
69690 6948
27712 6924
15497 6907
Top 20
departure_delay_in_minutes
9704 1592
122928 1305
17110 1128
103605 1017
3758 978
67029 951
99302 933
80827 930
52728 921
110991 859
73014 853
76216 815
3539 794
74383 756
127820 750
75159 748
73025 729
5801 726
5741 724
10883 692
Top 20
arrival_delay_in_minutes
9704 1584.0
122928 1280.0
17110 1115.0
103605 1011.0
3758 970.0
80827 952.0
67029 940.0
52728 924.0
99302 920.0
110991 860.0
73014 823.0
76216 822.0
3539 795.0
74383 748.0
127820 729.0
75159 720.0
73025 717.0
5741 705.0
10883 702.0
5801 691.0
# Drop the rows with the most extreme delays.
# The previous expression compared departure_delay_in_minutes against both
# thresholds, which made the `< 691` clause redundant; the second bound is
# applied to arrival_delay_in_minutes instead.
# NOTE(review): the 20 largest departure delays printed earlier bottom out at
# 692, so 652 may be a typo for 692 -- threshold left unchanged, please confirm.
# `.copy()` yields an independent frame so that columns added to it later are
# not written to a filtered selection of the previous frame.
clean_air_satisfaction = clean_air_satisfaction.query(
    'departure_delay_in_minutes < 652 and arrival_delay_in_minutes < 691'
).copy()
clean_air_satisfaction
| satisfaction_v2 | gender | customer_type | age | type_of_travel | class | flight_distance | seat_comfort | departure_arrival_time_convenient | food_and_drink | ... | online_support | ease_of_online_booking | on_board_service | leg_room_service | baggage_handling | checkin_service | cleanliness | online_boarding | departure_delay_in_minutes | arrival_delay_in_minutes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | satisfied | Female | Loyal Customer | 65 | Personal Travel | Eco | 265 | 0 | 0 | 0 | ... | 2 | 3 | 3 | 0 | 3 | 5 | 3 | 2 | 0 | 0.0 |
| 1 | satisfied | Male | Loyal Customer | 47 | Personal Travel | Business | 2464 | 0 | 0 | 0 | ... | 2 | 3 | 4 | 4 | 4 | 2 | 3 | 2 | 310 | 305.0 |
| 2 | satisfied | Female | Loyal Customer | 15 | Personal Travel | Eco | 2138 | 0 | 0 | 0 | ... | 2 | 2 | 3 | 3 | 4 | 4 | 4 | 2 | 0 | 0.0 |
| 3 | satisfied | Female | Loyal Customer | 60 | Personal Travel | Eco | 623 | 0 | 0 | 0 | ... | 3 | 1 | 1 | 0 | 1 | 4 | 1 | 3 | 0 | 0.0 |
| 4 | satisfied | Female | Loyal Customer | 70 | Personal Travel | Eco | 354 | 0 | 0 | 0 | ... | 4 | 2 | 2 | 0 | 2 | 4 | 2 | 5 | 0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 129875 | satisfied | Female | disloyal Customer | 29 | Personal Travel | Eco | 1731 | 5 | 5 | 5 | ... | 2 | 2 | 3 | 3 | 4 | 4 | 4 | 2 | 0 | 0.0 |
| 129876 | neutral or dissatisfied | Male | disloyal Customer | 63 | Personal Travel | Business | 2087 | 2 | 3 | 2 | ... | 1 | 3 | 2 | 3 | 3 | 1 | 2 | 1 | 174 | 172.0 |
| 129877 | neutral or dissatisfied | Male | disloyal Customer | 69 | Personal Travel | Eco | 2320 | 3 | 0 | 3 | ... | 2 | 4 | 4 | 3 | 4 | 2 | 3 | 2 | 155 | 163.0 |
| 129878 | neutral or dissatisfied | Male | disloyal Customer | 66 | Personal Travel | Eco | 2450 | 3 | 2 | 3 | ... | 2 | 3 | 3 | 2 | 3 | 2 | 1 | 2 | 193 | 205.0 |
| 129879 | neutral or dissatisfied | Female | disloyal Customer | 38 | Personal Travel | Eco | 4307 | 3 | 4 | 3 | ... | 3 | 4 | 5 | 5 | 5 | 3 | 3 | 3 | 185 | 186.0 |
129466 rows × 23 columns
# Total delay per row = departure delay + arrival delay (both in minutes).
# Adding the two columns directly is vectorised and yields the same values as
# a row-by-row apply, without invoking a Python lambda once per row.
clean_air_satisfaction['sum_delay'] = (
    clean_air_satisfaction['departure_delay_in_minutes']
    + clean_air_satisfaction['arrival_delay_in_minutes']
)
clean_air_satisfaction[['departure_delay_in_minutes', 'arrival_delay_in_minutes', 'sum_delay']]
| departure_delay_in_minutes | arrival_delay_in_minutes | sum_delay | |
|---|---|---|---|
| 0 | 0 | 0.0 | 0.0 |
| 1 | 310 | 305.0 | 615.0 |
| 2 | 0 | 0.0 | 0.0 |
| 3 | 0 | 0.0 | 0.0 |
| 4 | 0 | 0.0 | 0.0 |
| ... | ... | ... | ... |
| 129875 | 0 | 0.0 | 0.0 |
| 129876 | 174 | 172.0 | 346.0 |
| 129877 | 155 | 163.0 | 318.0 |
| 129878 | 193 | 205.0 | 398.0 |
| 129879 | 185 | 186.0 | 371.0 |
129466 rows × 3 columns
# Age summary (count, min, max, mean, rounded to 2 decimals) per customer type.
# The grouping column is categorical, so `observed=False` is passed explicitly:
# it keeps every category in the result and does not depend on the library's
# default for categorical groupers.
(
    clean_air_satisfaction
    .groupby('customer_type', observed=False)[['age']]
    .agg(['count', 'min', 'max', 'mean'])
    .round(2)
)
| age | ||||
|---|---|---|---|---|
| count | min | max | mean | |
| customer_type | ||||
| Loyal Customer | 105754 | 7 | 85 | 41.47 |
| disloyal Customer | 23712 | 7 | 85 | 30.35 |
# Flight-distance summary (count, min, max, mean, rounded to 2 decimals) for
# each travel type / satisfaction combination.
# The grouping columns are categorical, so `observed=False` is passed
# explicitly: every category combination stays in the result regardless of the
# library's default for categorical groupers.
(
    clean_air_satisfaction
    .groupby(['type_of_travel', 'satisfaction_v2'], observed=False)[['flight_distance']]
    .agg(['count', 'min', 'max', 'mean'])
    .round(2)
)
| flight_distance | |||||
|---|---|---|---|---|---|
| count | min | max | mean | ||
| type_of_travel | satisfaction_v2 | ||||
| Business travel | neutral or dissatisfied | 37229 | 50 | 6951 | 2044.93 |
| satisfied | 52202 | 50 | 6950 | 2079.99 | |
| Personal Travel | neutral or dissatisfied | 21360 | 50 | 6924 | 1989.91 |
| satisfied | 18675 | 50 | 6792 | 1565.18 | |
# Flight-distance and total-delay summaries (count, min, max, mean, rounded to
# 2 decimals) for each travel type / cabin class / satisfaction combination.
# The grouping columns are categorical, so `observed=False` is passed
# explicitly: every category combination stays in the result regardless of the
# library's default for categorical groupers.
(
    clean_air_satisfaction
    .groupby(['type_of_travel', 'class', 'satisfaction_v2'], observed=False)[
        ['flight_distance', 'sum_delay']
    ]
    .agg(['count', 'min', 'max', 'mean'])
    .round(2)
)
| flight_distance | sum_delay | |||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| count | min | max | mean | count | min | max | mean | |||
| type_of_travel | class | satisfaction_v2 | ||||||||
| Business travel | Business | neutral or dissatisfied | 16580 | 51 | 6951 | 2254.50 | 16580 | 0.0 | 1151.0 | 35.13 |
| satisfied | 42733 | 50 | 6950 | 2166.07 | 42733 | 0.0 | 1239.0 | 25.65 | ||
| Eco | neutral or dissatisfied | 17679 | 50 | 6595 | 1881.55 | 17679 | 0.0 | 1166.0 | 35.39 | |
| satisfied | 7550 | 50 | 6816 | 1691.66 | 7550 | 0.0 | 1114.0 | 26.11 | ||
| Eco Plus | neutral or dissatisfied | 2970 | 54 | 6324 | 1847.54 | 2970 | 0.0 | 955.0 | 38.98 | |
| satisfied | 1919 | 52 | 6733 | 1690.99 | 1919 | 0.0 | 824.0 | 23.84 | ||
| Personal Travel | Business | neutral or dissatisfied | 1424 | 55 | 6865 | 1458.28 | 1424 | 0.0 | 678.0 | 36.36 |
| satisfied | 1240 | 51 | 6591 | 1353.44 | 1240 | 0.0 | 725.0 | 21.88 | ||
| Eco | neutral or dissatisfied | 17533 | 50 | 6924 | 2028.61 | 17533 | 0.0 | 1230.0 | 36.12 | |
| satisfied | 15347 | 50 | 6792 | 1587.60 | 15347 | 0.0 | 867.0 | 20.18 | ||
| Eco Plus | neutral or dissatisfied | 2403 | 50 | 6889 | 2022.57 | 2403 | 0.0 | 1137.0 | 36.08 | |
| satisfied | 2088 | 50 | 6598 | 1526.13 | 2088 | 0.0 | 239.0 | 19.67 | ||
# Flight-distance and total-delay summaries (count, mean, median, rounded to
# 2 decimals) for each gender / satisfaction combination.
# The grouping columns are categorical, so `observed=False` is passed
# explicitly: every category combination stays in the result regardless of the
# library's default for categorical groupers.
(
    clean_air_satisfaction
    .groupby(['gender', 'satisfaction_v2'], observed=False)[['flight_distance', 'sum_delay']]
    .agg(['count', 'mean', 'median'])
    .round(2)
)
| flight_distance | sum_delay | ||||||
|---|---|---|---|---|---|---|---|
| count | mean | median | count | mean | median | ||
| gender | satisfaction_v2 | ||||||
| Female | neutral or dissatisfied | 22894 | 1924.02 | 1888.0 | 22894 | 39.40 | 5.0 |
| satisfied | 42799 | 1823.67 | 1776.0 | 42799 | 23.91 | 1.0 | |
| Male | neutral or dissatisfied | 35695 | 2089.56 | 1989.0 | 35695 | 33.44 | 3.0 |
| satisfied | 28078 | 2128.30 | 2044.0 | 28078 | 24.71 | 1.0 | |
# Flight-distance and total-delay summaries (count, mean, median, rounded to
# 2 decimals) for each customer type / satisfaction combination.
# The grouping columns are categorical, so `observed=False` is passed
# explicitly: every category combination stays in the result regardless of the
# library's default for categorical groupers.
(
    clean_air_satisfaction
    .groupby(['customer_type', 'satisfaction_v2'], observed=False)[
        ['flight_distance', 'sum_delay']
    ]
    .agg(['count', 'mean', 'median'])
    .round(2)
)
| flight_distance | sum_delay | ||||||
|---|---|---|---|---|---|---|---|
| count | mean | median | count | mean | median | ||
| customer_type | satisfaction_v2 | ||||||
| Loyal Customer | neutral or dissatisfied | 40565 | 2027.76 | 1959.0 | 40565 | 37.49 | 4.0 |
| satisfied | 65189 | 1936.47 | 1869.0 | 65189 | 24.13 | 1.0 | |
| disloyal Customer | neutral or dissatisfied | 18024 | 2018.38 | 1948.0 | 18024 | 31.90 | 3.0 |
| satisfied | 5688 | 2034.60 | 1985.0 | 5688 | 25.33 | 1.0 | |
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
col_list = [
    'age',
    'flight_distance',
    'departure_delay_in_minutes',
    'arrival_delay_in_minutes',
]
numeric_correlations = clean_air_satisfaction[col_list].corr()
sns.heatmap(numeric_correlations, fmt=".2f", annot=True)
plt.show()

# Number of rows per satisfaction label.
sns.countplot(data=clean_air_satisfaction, x='satisfaction_v2')
plt.show()

# Same count, split by customer type with one bar colour per satisfaction label.
sns.countplot(data=clean_air_satisfaction, x='customer_type', hue='satisfaction_v2')
plt.show()

# Age density, stacked by satisfaction label.
sns.kdeplot(data=clean_air_satisfaction, x="age", multiple="stack", hue='satisfaction_v2')
plt.show()

# Age box plots per cabin class, split by satisfaction label.
sns.catplot(data=clean_air_satisfaction, kind='box', x="age", y="class", hue='satisfaction_v2')
plt.show()
C:\Users\porms\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# Bar chart of age against satisfaction label, one facet column per travel type.
sns.catplot(
    data=clean_air_satisfaction,
    kind="bar",
    col="type_of_travel",
    x="satisfaction_v2",
    y="age",
)
C:\Users\porms\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
<seaborn.axisgrid.FacetGrid at 0x29dce390650>
# Share of each satisfaction label among the cleaned rows, as a pie chart.
satisfaction_count = clean_air_satisfaction['satisfaction_v2'].value_counts()
plt.pie(
    x=satisfaction_count,
    autopct='%1.1f%%',
    labels=satisfaction_count.index,
)
plt.title('Pie Chart of Satisfaction')
plt.show()
import plotly.express as px

# Interactive scatter of total delay against flight distance, coloured by
# satisfaction label.
fig = px.scatter(
    data_frame=clean_air_satisfaction,
    color='satisfaction_v2',
    x='flight_distance',
    y='sum_delay',
)
fig.show()
# Static scatter of total delay against flight distance, coloured by
# satisfaction label.
ax = sns.scatterplot(data=clean_air_satisfaction, x='flight_distance', y='sum_delay', hue='satisfaction_v2')
# Pin the legend to a fixed corner: with this many points the default
# loc="best" placement search is slow and triggers a matplotlib UserWarning.
sns.move_legend(ax, 'upper right')
ax
<Axes: xlabel='flight_distance', ylabel='sum_delay'>
C:\Users\porms\anaconda3\Lib\site-packages\IPython\core\events.py:93: UserWarning: Creating legend with loc="best" can be slow with large amounts of data. C:\Users\porms\anaconda3\Lib\site-packages\IPython\core\pylabtools.py:152: UserWarning: Creating legend with loc="best" can be slow with large amounts of data.
# Distribution of flight distance: density-normalised histogram with a KDE
# curve overlaid.  `histplot` replaces the deprecated `distplot`, which is
# scheduled for removal from seaborn; kde=True and stat='density' keep the
# histogram-plus-density presentation.
sns.histplot(data=clean_air_satisfaction, x='flight_distance', kde=True, stat='density')
plt.title('Histogram of flight distance')
plt.xlabel('flight_distance')
C:\Users\porms\AppData\Local\Temp\ipykernel_15288\1216648016.py:1: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
Text(0.5, 0, 'flight_distance')
# Pairwise relationship grid over the cleaned frame, coloured by satisfaction label.
sns.pairplot(data=clean_air_satisfaction, hue="satisfaction_v2")
C:\Users\porms\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
<seaborn.axisgrid.PairGrid at 0x25f0ce7a850>